{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/emotion/emotion-twitter-lexicon.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = '1'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-10-06 13:58:28.904459: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA\n",
      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2023-10-06 13:58:28.983530: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
      "2023-10-06 13:58:29.396327: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n",
      "2023-10-06 13:58:29.396363: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n",
      "2023-10-06 13:58:29.396367: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n",
      "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n",
      "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "import json\n",
    "import random\n",
    "import torch\n",
    "import numpy as np\n",
    "from transformers import AutoTokenizer, T5Config\n",
    "from malaya.torch_model.t5 import T5ForSequenceClassification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['anger', 'fear', 'happy', 'love', 'sadness', 'surprise'])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "emotion_label = ['anger', 'fear', 'happy', 'love', 'sadness', 'surprise']\n",
    "\n",
    "with open('emotion-twitter-lexicon.json') as fopen:\n",
    "    emotion = json.load(fopen)\n",
    "    \n",
    "emotion.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "anger 30000\n",
      "fear 20316\n",
      "happy 30000\n",
      "love 20783\n",
      "sadness 26468\n",
      "surprise 13107\n"
     ]
    }
   ],
   "source": [
    "texts, labels = [], []\n",
    "\n",
    "for k, v in emotion.items():\n",
    "    if len(v) > 30000:\n",
    "        emotion[k] = random.sample(v, 30000)\n",
    "    print(k, len(emotion[k]))\n",
    "    texts.extend(emotion[k])\n",
    "    labels.extend([emotion_label.index(k)] * len(emotion[k]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████| 140674/140674 [00:00<00:00, 4090779.71it/s]\n"
     ]
    }
   ],
   "source": [
    "actual_t, actual_l = [], []\n",
    "\n",
    "for i in tqdm(range(len(texts))):\n",
    "    if len(texts[i]) > 2:\n",
    "        actual_t.append(texts[i])\n",
    "        actual_l.append(labels[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "train_X, test_X, train_Y, test_Y = train_test_split(\n",
    "    actual_t, actual_l, test_size = 0.2\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(set(actual_l))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "039bd7e1fa964b86a14769c159b37183",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)lve/main/config.json:   0%|          | 0.00/790 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "config = T5Config.from_pretrained('mesolitica/nanot5-small-malaysian-cased')\n",
    "config.num_labels = len(set(actual_l))\n",
    "config.vocab = list(emotion.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e95d4b7ba6f04f6f9fd0ddd718a153c0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading pytorch_model.bin:   0%|          | 0.00/358M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at mesolitica/nanot5-small-malaysian-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "model = T5ForSequenceClassification.from_pretrained('mesolitica/nanot5-small-malaysian-cased', config = config)\n",
    "_ = model.cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained('mesolitica/nanot5-base-malaysian-cased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainable_parameters = [param for param in model.parameters() if param.requires_grad]\n",
    "trainer = torch.optim.AdamW(trainable_parameters, lr = 2e-4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████| 7034/7034 [05:40<00:00, 20.68it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 0, loss: 0.5093705902411132, dev_predicted: 0.9211199545196134\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████| 7034/7034 [06:18<00:00, 18.60it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 1, loss: 0.2539201259537832, dev_predicted: 0.9413018760659465\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████| 7034/7034 [06:13<00:00, 18.82it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 2, loss: 0.2022606245867862, dev_predicted: 0.9502202956225128\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████| 7034/7034 [06:18<00:00, 18.60it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 3, loss: 0.17207835702439594, dev_predicted: 0.9575042637862422\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████| 7034/7034 [06:13<00:00, 18.83it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 4, loss: 0.14899513813516257, dev_predicted: 0.9607021034678794\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████| 7034/7034 [05:16<00:00, 22.23it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 5, loss: 0.13533089537048257, dev_predicted: 0.9631893121091529\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████| 7034/7034 [03:27<00:00, 33.84it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 6, loss: 0.12416590529366782, dev_predicted: 0.9643973848777715\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████| 7034/7034 [03:27<00:00, 33.97it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 7, loss: 0.11610092148560636, dev_predicted: 0.9678794769755543\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████| 7034/7034 [03:26<00:00, 34.04it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 8, loss: 0.10486150778757097, dev_predicted: 0.9689098919840818\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████| 7034/7034 [03:26<00:00, 34.09it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 9, loss: 0.1024017584425029, dev_predicted: 0.969148460976204\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████| 7034/7034 [03:25<00:00, 34.18it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 10, loss: 0.09778238821261064, dev_predicted: 0.9703311540648095\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████| 7034/7034 [03:26<00:00, 34.13it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 11, loss: 0.09108099877250556, dev_predicted: 0.969300739056282\n"
     ]
    }
   ],
   "source": [
    "batch_size = 16\n",
    "epoch = 100\n",
    "\n",
    "best_dev_acc = -np.inf\n",
    "patient = 1\n",
    "current_patient = 0\n",
    "\n",
    "for e in range(epoch):\n",
    "    pbar = tqdm(range(0, len(train_X), batch_size))\n",
    "    losses = []\n",
    "    for i in pbar:\n",
    "        trainer.zero_grad()\n",
    "        x = train_X[i: i + batch_size]\n",
    "        y = np.array(train_Y[i: i + batch_size])\n",
    "        \n",
    "        padded = tokenizer(x, padding = 'longest', return_tensors = 'pt')\n",
    "        padded['labels'] = torch.from_numpy(y)\n",
    "        for k in padded.keys():\n",
    "            padded[k] = padded[k].cuda()\n",
    "            \n",
    "        loss, pred = model(**padded)\n",
    "        loss.backward()\n",
    "        \n",
    "        grad_norm = torch.nn.utils.clip_grad_norm_(trainable_parameters, 5.0)\n",
    "        trainer.step()\n",
    "        losses.append(float(loss))\n",
    "        \n",
    "    dev_predicted = []\n",
    "    for i in range(0, len(test_X), batch_size):\n",
    "        x = test_X[i: i + batch_size]\n",
    "        y = np.array(test_Y[i: i + batch_size])\n",
    "        padded = tokenizer(x, padding = 'longest', return_tensors = 'pt')\n",
    "        padded['labels'] = torch.from_numpy(y)\n",
    "        for k in padded.keys():\n",
    "            padded[k] = padded[k].cuda()\n",
    "        \n",
    "        loss, pred = model(**padded)\n",
    "        dev_predicted.append((pred.argmax(axis = 1).detach().cpu().numpy() == y).mean())\n",
    "        \n",
    "    dev_predicted = np.mean(dev_predicted)\n",
    "    \n",
    "    print(f'epoch: {e}, loss: {np.mean(losses)}, dev_predicted: {dev_predicted}')\n",
    "    \n",
    "    if dev_predicted >= best_dev_acc:\n",
    "        best_dev_acc = dev_predicted\n",
    "        current_patient = 0\n",
    "        model.save_pretrained('small')\n",
    "    else:\n",
    "        current_patient += 1\n",
    "    \n",
    "    if current_patient >= patient:\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_ = T5ForSequenceClassification.from_pretrained('small')\n",
    "_ = model_.cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████| 1759/1759 [00:17<00:00, 98.97it/s]\n"
     ]
    }
   ],
   "source": [
    "real_Y = []\n",
    "for i in tqdm(range(0, len(test_X), batch_size)):\n",
    "    x = test_X[i: i + batch_size]\n",
    "    y = np.array(test_Y[i: i + batch_size])\n",
    "    padded = tokenizer(x, padding = 'longest', return_tensors = 'pt')\n",
    "    padded['labels'] = torch.from_numpy(y)\n",
    "    for k in padded.keys():\n",
    "        padded[k] = padded[k].cuda()\n",
    "\n",
    "    loss, pred = model(**padded)\n",
    "    real_Y.extend(pred.argmax(axis = 1).detach().cpu().numpy().tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "       anger    0.95647   0.95266   0.95456      6020\n",
      "        fear    0.95685   0.97154   0.96414      3971\n",
      "       happy    0.99236   0.98256   0.98744      6079\n",
      "        love    0.94842   0.96704   0.95764      4126\n",
      "     sadness    0.98210   0.96899   0.97550      5321\n",
      "    surprise    0.97263   0.97746   0.97504      2618\n",
      "\n",
      "    accuracy                        0.96929     28135\n",
      "   macro avg    0.96814   0.97004   0.96905     28135\n",
      "weighted avg    0.96945   0.96929   0.96933     28135\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "\n",
    "print(\n",
    "    metrics.classification_report(\n",
    "        real_Y, test_Y, target_names = ['anger', 'fear', 'happy', 'love', 'sadness', 'surprise'],\n",
    "        digits = 5\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "00a4f10fb87c4920b80b7dec80a41a26",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/mesolitica/emotion-analysis-nanot5-small-malaysian-cased/commit/5b7e1a6e54ea155127a333c8978684d7bf551f21', commit_message='Upload T5ForSequenceClassification', commit_description='', oid='5b7e1a6e54ea155127a333c8978684d7bf551f21', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model_.push_to_hub('mesolitica/emotion-analysis-nanot5-small-malaysian-cased', safe_serialization = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/mesolitica/emotion-analysis-nanot5-small-malaysian-cased/commit/63d2ab0734afbef58b902c0d75a7d9d1ea49b090', commit_message='Upload tokenizer', commit_description='', oid='63d2ab0734afbef58b902c0d75a7d9d1ea49b090', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.push_to_hub('mesolitica/emotion-analysis-nanot5-small-malaysian-cased', safe_serialization = True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
